df <- read.csv("C:/Users/tbawaskar/Documents/NYC/EDA/nyc_taxi.csv")
nrow(df) #number of rows in the dataset## [1] 1494926
ncol(df) #number of columns in the dataset## [1] 21
colnames(df) #column names## [1] "VendorID" "lpep_pickup_datetime"
## [3] "Lpep_dropoff_datetime" "Store_and_fwd_flag"
## [5] "RateCodeID" "Pickup_longitude"
## [7] "Pickup_latitude" "Dropoff_longitude"
## [9] "Dropoff_latitude" "Passenger_count"
## [11] "Trip_distance" "Fare_amount"
## [13] "Extra" "MTA_tax"
## [15] "Tip_amount" "Tolls_amount"
## [17] "Ehail_fee" "improvement_surcharge"
## [19] "Total_amount" "Payment_type"
## [21] "Trip_type"
summary(df) #summary of the dataset## VendorID lpep_pickup_datetime
## Min. :1.000 2015-09-20 02:00:32: 9
## 1st Qu.:2.000 2015-09-05 14:57:48: 8
## Median :2.000 2015-09-10 17:43:49: 8
## Mean :1.782 2015-09-13 00:27:28: 8
## 3rd Qu.:2.000 2015-09-13 01:06:29: 8
## Max. :2.000 2015-09-26 22:48:40: 8
## (Other) :1494877
## Lpep_dropoff_datetime Store_and_fwd_flag RateCodeID
## 2015-09-28 00:00:00: 172 N:1486192 Min. : 1.000
## 2015-09-13 00:00:00: 153 Y: 8734 1st Qu.: 1.000
## 2015-09-19 00:00:00: 141 Median : 1.000
## 2015-09-14 00:00:00: 126 Mean : 1.098
## 2015-09-21 00:00:00: 125 3rd Qu.: 1.000
## 2015-09-12 00:00:00: 119 Max. :99.000
## (Other) :1494090
## Pickup_longitude Pickup_latitude Dropoff_longitude Dropoff_latitude
## Min. :-83.32 Min. : 0.00 Min. :-83.43 Min. : 0.00
## 1st Qu.:-73.96 1st Qu.:40.70 1st Qu.:-73.97 1st Qu.:40.70
## Median :-73.95 Median :40.75 Median :-73.95 Median :40.75
## Mean :-73.83 Mean :40.69 Mean :-73.84 Mean :40.69
## 3rd Qu.:-73.92 3rd Qu.:40.80 3rd Qu.:-73.91 3rd Qu.:40.79
## Max. : 0.00 Max. :43.18 Max. : 0.00 Max. :42.80
##
## Passenger_count Trip_distance Fare_amount Extra
## Min. :0.000 Min. : 0.000 Min. :-475.00 Min. :-1.0000
## 1st Qu.:1.000 1st Qu.: 1.100 1st Qu.: 6.50 1st Qu.: 0.0000
## Median :1.000 Median : 1.980 Median : 9.50 Median : 0.5000
## Mean :1.371 Mean : 2.968 Mean : 12.54 Mean : 0.3513
## 3rd Qu.:1.000 3rd Qu.: 3.740 3rd Qu.: 15.50 3rd Qu.: 0.5000
## Max. :9.000 Max. :603.100 Max. : 580.50 Max. :12.0000
##
## MTA_tax Tip_amount Tolls_amount Ehail_fee
## Min. :-0.5000 Min. :-50.000 Min. :-15.2900 Mode:logical
## 1st Qu.: 0.5000 1st Qu.: 0.000 1st Qu.: 0.0000 NA's:1494926
## Median : 0.5000 Median : 0.000 Median : 0.0000
## Mean : 0.4866 Mean : 1.236 Mean : 0.1231
## 3rd Qu.: 0.5000 3rd Qu.: 2.000 3rd Qu.: 0.0000
## Max. : 0.5000 Max. :300.000 Max. : 95.7500
##
## improvement_surcharge Total_amount Payment_type Trip_type
## Min. :-0.3000 Min. :-475.00 Min. :1.000 Min. :1.000
## 1st Qu.: 0.3000 1st Qu.: 8.16 1st Qu.:1.000 1st Qu.:1.000
## Median : 0.3000 Median : 11.76 Median :2.000 Median :1.000
## Mean : 0.2921 Mean : 15.03 Mean :1.541 Mean :1.022
## 3rd Qu.: 0.3000 3rd Qu.: 18.30 3rd Qu.:2.000 3rd Qu.:1.000
## Max. : 0.3000 Max. : 581.30 Max. :5.000 Max. :2.000
## NA's :4
# checking for NA
colSums(is.na(df[,]))## VendorID lpep_pickup_datetime Lpep_dropoff_datetime
## 0 0 0
## Store_and_fwd_flag RateCodeID Pickup_longitude
## 0 0 0
## Pickup_latitude Dropoff_longitude Dropoff_latitude
## 0 0 0
## Passenger_count Trip_distance Fare_amount
## 0 0 0
## Extra MTA_tax Tip_amount
## 0 0 0
## Tolls_amount Ehail_fee improvement_surcharge
## 0 1494926 0
## Total_amount Payment_type Trip_type
## 0 0 4
# We can see Ehail_fee has no data hence we will eliminate it
df <- subset(df,select = -c(Ehail_fee))
# We can also see that only 4 observations are missing in Trip_type of 1.49 million observations
df <- df[complete.cases(df),]
# Checking the datatypes of each variable in the dataframe
sapply(df, class)## VendorID lpep_pickup_datetime Lpep_dropoff_datetime
## "integer" "factor" "factor"
## Store_and_fwd_flag RateCodeID Pickup_longitude
## "factor" "integer" "numeric"
## Pickup_latitude Dropoff_longitude Dropoff_latitude
## "numeric" "numeric" "numeric"
## Passenger_count Trip_distance Fare_amount
## "integer" "numeric" "numeric"
## Extra MTA_tax Tip_amount
## "numeric" "numeric" "numeric"
## Tolls_amount improvement_surcharge Total_amount
## "numeric" "numeric" "numeric"
## Payment_type Trip_type
## "integer" "integer"
*The values were normalized between 0 and 1, as unscaled variables didn’t give a proper a visualization.
# variables were then scaled to better understand their behavior
nums <- sapply(df, is.numeric)
num.df <- df[ , nums]
norm.df <- normalize(num.df, method = 'range', range = c(0,1))
#boxplot(norm.df, names=colnames(norm.df), las = 2)
ggplot(stack(norm.df), aes(x = ind, y = values)) +
geom_boxplot() + theme(axis.text.x =
element_text(size = 10,
angle = 45,
hjust = 1,
vjust = 1)) + labs (x = 'Variables',
y = 'Values',
title = 'Boxplot of all Variables')rm(num.df)
gc()## used (Mb) gc trigger (Mb) max used (Mb)
## Ncells 4347860 232.3 7790168 416.1 6136393 327.8
## Vcells 102689523 783.5 363061740 2770.0 453827176 3462.5
#Lets look at Trip Distance as it shows a high percentage of outliers
hist(df$Trip_distance, breaks = 5000, col = 'black',xlab = 'Trip Distance'
,main = 'Histogram of Trip distance vs frequency')hist (df$Trip_distance
,breaks = 5000
,col = "blue"
,xlab = 'Trip Distance'
,main = 'Trip distance Frequency'
,xlim = c(0,mean(df$Trip_distance) + 3*sd(df$Trip_distance))
,freq = TRUE)*It can be pointed out that there is an anomaly seen in the graph. There are number of rides where the distance covered is zero which is incorrect. We thus need to clean the data through for better predictions/results.
plyr::count(df, 'RateCodeID')# Upon analysis it was found that ratecode id 99 was an error
df <- subset(df,df$RateCodeID < 7)
ggplot(df, aes(y = Tip_amount, x = factor(RateCodeID), fill = factor(RateCodeID))) +
theme_light() +
stat_summary(fun.y = mean, # calc mean of all observations for the month
geom = "bar") +
scale_color_manual('') +
labs(x = 'Rate Code ID', y = 'Mean Tip', fill = "ID's")*The graph plotted above is a filtered dataset with observations that have payment type as Credit Card. Later, it was revealled that other payment types had an anamoly that needs to be fixed.
t.test(df[((df$RateCodeID == 1) | (df$RateCodeID == 5) | (df$RateCodeID == 6)),]$Tip_amount,
df[((df$RateCodeID == 2) | (df$RateCodeID == 3) | (df$RateCodeID == 4)),]$Tip_amount, var.equal = F)##
## Welch Two Sample t-test
##
## data: df[((df$RateCodeID == 1) | (df$RateCodeID == 5) | (df$RateCodeID == and df[((df$RateCodeID == 2) | (df$RateCodeID == 3) | (df$RateCodeID == 6)), ]$Tip_amount and 4)), ]$Tip_amount
## t = -31.302, df = 6480.5, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -3.469428 -3.060481
## sample estimates:
## mean of x mean of y
## 1.221586 4.486540
temp <- with(df,table(df$RateCodeID, df$Payment_type))
rownames(temp) <- c('Standard rate','JFK','Newark','Westchester','Negotiated fare','Group ride')
colnames(temp) <- c('Credit card','Cash','No charge','Dispute','Unknown')
t1 <- ttheme_default(core=list(
fg_params=list(fontface=c(rep("plain", 5), "bold.italic")),
bg_params = list(fill=c(rep(c("grey95", "grey90"),
length.out=5), "#6BAED6"),
alpha = rep(c(1,0.5), each=10))
))
## Cross table of the count of rides w.r.t the payment type and rate code id.
grid.table(temp, theme = t1)temp1 <- plyr::count(df, vars = 'Payment_type')
temp2 <- plyr::count(df[df$Tip_amount == 0,], vars = 'Payment_type')
temp <- merge(temp1,temp2,'Payment_type')
colnames(temp) = c('Payment_type','Tips Recorded', 'Tips (=$0) Recorded')
temp$Payment_type <- c('Credit card','Cash','No charge','Dispute','Unknown')
grid.table(temp, theme=t1)#Converting Trip Duratians to secs
x1 <- strptime(df$lpep_pickup_datetime, "%Y-%m-%d %H:%M:%OS")
x2 <- strptime(df$Lpep_dropoff_datetime, "%Y-%m-%d %H:%M:%OS")
df$trip_duration <- as.numeric(x2-x1,units="secs") #this is a derived feature
#dividing into hours
time.category <- with(df, ifelse(trip_duration <= (4*3600), 1,
ifelse(trip_duration >= 5*3600 & trip_duration <= 24*3600, 2, 3))
)
aggregate(df$trip_duration,by=list(time.category),FUN=length)gc()## used (Mb) gc trigger (Mb) max used (Mb)
## Ncells 4393476 234.7 7790168 416.1 7790168 416.1
## Vcells 96739499 738.1 290449392 2216.0 453827176 3462.5
df$hour <- as.POSIXlt(anytime(as.factor(df$lpep_pickup_datetime)))$hour
temp <- aggregate(.~ hour, data = df,mean)[,c('hour','Trip_distance','Tip_amount')]
tempggplot(data = temp, aes(x = hour)) +
geom_line(aes(y = Trip_distance, colour = 'Mean Trip Distance')) +
geom_line(aes(y = Tip_amount, colour = 'Mean Tip Amount')) +
theme_bw() +
labs(x = "Hour of the day", y = 'Values') +
scale_colour_manual('Legend ',breaks = c("Mean Trip Distance", "Mean Tip Amount")
,values=c('Mean Trip Distance'="blue",'Mean Tip Amount'="red"))gc()## used (Mb) gc trigger (Mb) max used (Mb)
## Ncells 4399876 235 7790168 416.1 7790168 416.1
## Vcells 88595980 676 290449392 2216.0 453827176 3462.5
#latitude and longtitude other than the area covered by Green Taxis
temp <- data.frame(df$Pickup_longitude, df$Pickup_latitude)
colnames(temp) = c('lon','lat')
usa_center = as.numeric(geocode("United States"))## Information from URL : http://maps.googleapis.com/maps/api/geocode/json?address=United%20States&sensor=false
USAMap = ggmap(get_googlemap(center=usa_center, scale=2, zoom=4), extent="panel")## Map from URL : http://maps.googleapis.com/maps/api/staticmap?center=37.09024,-95.712891&zoom=4&size=640x640&scale=2&maptype=terrain&sensor=false
USAMap +
geom_point(aes(x=lon, y=lat), data=temp, col="orange", alpha=0.4) ndf <- data.frame(Dropoff_longitude=df$Dropoff_longitude
,Dropoff_latitude=df$Dropoff_latitude
,Pickup_longitude=df$Pickup_longitude
,Pickup_latitude=df$Pickup_latitude)
nw <- list(lat = 40.917577, lon = -74.259090)
se <- list(lat = 40.477399, lon = -73.700272)
ind <- which(df$Dropoff_longitude < nw$lon | df$Dropoff_longitude > se$lon)
ndf$Dropoff_longitude[ind] <- NA
ind <- which(df$Pickup_longitude < nw$lon | df$Pickup_longitude > se$lon)
ndf$Pickup_longitude[ind] <- NA
ind <- which(df$Dropoff_latitude < se$lat | df$Dropoff_latitude > nw$lat)
ndf$Dropoff_latitude[ind] <- NA
ind <- which(df$Pickup_latitude < se$lat | df$Pickup_latitude > nw$lat)
ndf$Pickup_latitude[ind] <- NA
na_count <- sapply(ndf, function(y) sum(length(which(is.na(y)))))
na_count <- data.frame(na_count)
na_count# 0 passenger count
nrow(df[df$Passenger_count == 0,])## [1] 436
# More than 7 passenger count
nrow(df[df$Passenger_count > 8,])## [1] 16
# Fare Amount less than 2.5
nrow(df[df$Fare_amount < 2.50,])## [1] 7455
nrow(df[df$Fare_amount < 0 & df$VendorID == 2,])## [1] 2417
nrow(df[df$Fare_amount < 0 & df$VendorID == 1,])## [1] 0
neg.vars <- c('Fare_amount','Extra','improvement_surcharge','Total_amount','MTA_tax','Tip_amount')
df[df$Fare_amount < 0,][neg.vars] <- df[df$Fare_amount < 0,][neg.vars]*-1
#Removing Fare amount less than 2.5
df <- subset(df,df[,'Fare_amount'] >= 2.5)
# Distances greater then 0
df <- subset(df,df[,11] > 0)
nrow(df)## [1] 1470312
#Trip Durations greater then 4 hrs
df <- subset(df,df[,21] < (4*3600))
#remove trip Durations less then 2min
df <- subset(df,df[,21] > (2*60))
nrow(df)## [1] 1431210
# set coordinates outside of NYC bounding box to NA(reference taken from
# https://www.maptechnica.com/city-map/New%20York/NY/3651000)
nw <- list(lat = 40.917577, lon = -74.259090)
se <- list(lat = 40.477399, lon = -73.700272)
ind <- which(df$Dropoff_longitude < nw$lon | df$Dropoff_longitude > se$lon)
df$Dropoff_longitude[ind] <- NA
ind <- which(df$Pickup_longitude < nw$lon | df$Pickup_longitude > se$lon)
df$Pickup_longitude[ind] <- NA
ind <- which(df$Dropoff_latitude < se$lat | df$Dropoff_latitude > nw$lat)
df$Dropoff_latitude[ind] <- NA
ind <- which(df$Pickup_latitude < se$lat | df$Pickup_latitude > nw$lat)
df$Pickup_latitude[ind] <- NA
nrow(df)## [1] 1431210
df <- df[complete.cases(df),]
nrow(df)## [1] 1427671
# passengers < 7
df <- subset(df,df[,10] < 7)
# Replace passengers with zero count with the median value i.e. 1
df$Passenger_count[df$Passenger_count == 0] <- 1
# Since payment types other than credit card have zero tips 99% of the time
df <- subset(df, Payment_type == 1)
# Convert to non-airport and airport trips
df[df$RateCodeID==1|df$RateCodeID==5|df$RateCodeID==6,]$RateCodeID <- 0
df[df$RateCodeID==2|df$RateCodeID==3|df$RateCodeID==4,]$RateCodeID <- 1nums <- sapply(df, is.numeric) #taking only numeric class
num.df <- df[ , nums]
gc()
#display only tip_percent
corrplot(cor(num.df[,-16])[,18], method = "number",tl.cex = 1,type="lower",diag=FALSE)df$Tip_percent <- (df$Tip_amount/df$Total_amount)*100
clean_datetime <- df %>%
mutate(lpep_pickup_datetime = ymd_hms(lpep_pickup_datetime)) %>%
mutate(Lpep_dropoff_datetime = ymd_hms(Lpep_dropoff_datetime)) %>%
mutate(weekday_pickup = weekdays(lpep_pickup_datetime)) %>%
mutate(weekday_dropoff= weekdays(Lpep_dropoff_datetime))%>%
mutate(hpick = hour(lpep_pickup_datetime)) %>%
mutate(date1 = date(lpep_pickup_datetime))
#from the above code we get derived features such as weekday pickup,hour of pickup
temp <- clean_datetime %>%
group_by(weekday_pickup) %>%
summarize(Count_Trips = n(), avg_dist = mean(Trip_distance),
avg_passengers = mean(Passenger_count),
avg_price = mean(Total_amount),
avg_Tip = mean(Tip_amount),
Total_tip = sum(Tip_amount))
temp[c(2,5,3,4,1,6,7),] <- temp[c(1,2,3,4,5,6,7),]
temp$ratio <- temp$avg_Tip/temp$avg_dist
head(temp,7)ggplot(temp, aes(x = factor(weekday_pickup),group = 1)) +
geom_point(aes(y = avg_Tip), color = 'red') +
geom_line(aes(y = avg_Tip), color = 'blue') +
geom_point(aes(y = avg_dist), color = 'black') +
geom_line(aes(y = avg_dist), color = 'cyan') +
theme_bw() +
xlab('Days of the Week') +
ylab('Avg Tip / Avg Distance') +
annotate("text", x = 2.0, y = 3.62, label = "Average Distance") +
annotate("text", x = 3.1, y = 2.75, label = "Average Tips") +
scale_x_discrete(limits=temp$weekday_pickup)p1 <- ggplot(temp, aes(x = weekday_pickup
,group = 1)) +
geom_point(aes(y = ratio, size = Count_Trips), color = 'yellow') +
geom_line(aes(y = ratio), color = 'green') +
theme_light() +
xlab('Days of the Week') +
ylab('Tip Recieved / Distance Travelled') +
scale_x_discrete(limits=temp$weekday_pickup) +
scale_size_continuous(range = c(5,20)) +
theme(legend.position="right")
ggplotly(p1)temp <- clean_datetime %>%
group_by(hour,weekday_pickup) %>%
summarise(Mean_Tip = mean(Tip_amount))
ggplot(temp, aes(hour, Mean_Tip)) +
geom_point() +
geom_line() +
theme_bw() +
facet_wrap(~factor(weekday_pickup)) +
labs(x = 'Hour of the day', y = 'Mean Tip')ggplot(temp[temp$hour <= 12,], aes(hour, Mean_Tip)) +
geom_point() +
geom_line() +
theme_bw() +
facet_wrap(~factor(weekday_pickup)) +
labs(x = 'Hours before Noon', y = 'Mean Tip', Title = 'Tip recieved before 12pm')ggplot(temp[temp$hour > 12,], aes(hour, Mean_Tip)) +
geom_point() +
geom_line() +
theme_bw() +
facet_wrap(~factor(weekday_pickup)) +
labs(x = 'Hours after Noon', y = 'Mean Tip')df$weekday_num<-as.integer(format(as.Date(df$lpep_pickup_datetime),"%w"))